

* +++++++++++++++++++++
* CLEAN APPRENTICESHIP DATA
* +++++++++++++++++++++

/*
* source: Arbeitsagentur
Bewerber und Berufsausbildungsstellen
https://statistik.arbeitsagentur.de/SiteGlobals/Forms/Suche/Einzelheftsuche_Formular.html?gtp=15084_list%253D4&topic_f=analyse
*/ 

global data_raw_lehrstellen "${data_raw}/apprenticeships/"

* clean up crosswalk from Arbeitsagentur to Kreis
* note: many Kreise will be assigned the same Arbeitsagentur
import excel "${data_raw}/geo/Zuordnung-Agenturen.xlsx", clear
ren (A D) (agency_number ags)
keep if _n >=7 & _n <=410
destring agency_number ags, replace force
keep agency_number ags
tempfile cw_ags
save `cw_ags'

* loop over files from different times
cd "${data_raw_lehrstellen}/"
local i = 1
foreach suff in xls xlsx {
local files : dir "${data_raw_lehrstellen}/" files "*.`suff'"

	foreach file in `files' {
		di "`file'"
		import excel "`file'", clear sheet("Tabelle 1 B")
		
		* give meaningful names and focus on main variables
		ren (A B E H) (agency_name open_positions applicants positions_per_applicant)
		keep agency_name open_positions applicants positions_per_applicant
		
		* clean up date
		gen date = agency_name[5]
		split date, parse(", ")
		split date1, parse(" ")
		drop date date2 date1
		ren (date11 date12) (month year)

		* focus on rows by Arbeitsagentur (i.e. drop totals)
		keep if strpos(agency_name, " AA ")!=0
		split agency_name, parse(" AA ")
		ren (agency_name1 agency_name2) (agency_number place_name)
		drop agency_name
		
		* clean up 
		destring open_positions applicants positions_per_applicant year agency_number, ///
			replace force

		tempfile file`i'
		save `file`i''
		local ++i 
	}
}

* combine various files (from different dates)
local --i 
use `file1', clear
forval j = 2/`i' {
	append using `file`j''
}

* drop the covid year 2020 because data incomplete
keep if year <=2019

* average over all months in the data
collapse (mean) open_positions applicants positions_per_applicant /// 
	(first) place_name, by(year agency_number)

* crosswalk to Kreis
joinby agency_number using `cw_ags'

* create logs 
foreach var in open_positions applicants positions_per_applicant {
	gen log_`var' = log(`var')
}

* clean up and save
order ags year agency_number place_name, first
sort  ags year agency_number

save "${data_derived}/positions_and_applicants_per_year_and_kreis.dta", replace

	
